Slavko Zitnik, March 4, 2021
Code is adapted from "Google News and Leo Tolstoy: Visualizing Word2Vec Word Embeddings using t-SNE" by Sergey Smetanin.
We use Google's Word2Vec vectors from https://code.google.com/archive/p/word2vec/.
We are publishing pre-trained vectors trained on part of Google News dataset (about 100 billion words). The model contains 300-dimensional vectors for 3 million words and phrases. The phrases were obtained using a simple data-driven approach described by Mikolov et al., 2013.
Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean. Distributed Representations of Words and Phrases and their Compositionality. In Proceedings of NIPS, 2013.
import gensim
import gensim.downloader
from sklearn.manifold import TSNE
import numpy as np
import tempfile
import imageio
import shutil
import os
from statistics import mean
import pandas as pd
from IPython.display import Image
from IPython.display import display
pd.options.display.max_columns = None
import matplotlib.patheffects as PathEffects
import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline
from matplotlib.axes._axes import _log as matplotlib_axes_logger
matplotlib_axes_logger.setLevel('ERROR')
# Load Google's pre-trained 300-dimensional word2vec vectors via the
# gensim downloader (downloads the model on first use).
model_gn = gensim.downloader.load('word2vec-google-news-300')
# Seed terms describing categories of abusive / offensive language; the
# nearest neighbours of each one are collected and visualized below.
keys = [ \
"profane",
"slur",
"derailing",
"harrasment", # NOTE(review): spelled "harrasment" — looks like a typo for "harassment"; confirm the misspelling is intended / actually present in the model vocabulary
"stereotype",
"spam",
"obscene",
"dominance",
"threat",
"discredit",
"hateful", # hate-speech not in dictionary, should be taken into account together
"insult",
"benevolent",
"hostile"
]
# For every key word, gather its 30 nearest neighbours in the embedding
# space together with their vectors (one sub-list per key word).
embedding_clusters = []
word_clusters = []
for key_word in keys:
    neighbours = model_gn.most_similar(key_word, topn=30)
    word_clusters.append([w for w, _ in neighbours])
    embedding_clusters.append([model_gn[w] for w, _ in neighbours])
Top 30 neighbouring words for each key word.
# Tabulate the neighbour lists: one DataFrame column per key word.
df = pd.DataFrame(dict(zip(keys, word_clusters)))
display(df)
# Flatten the (n key words x m neighbours x k dims) array, project all
# points to 2-D with a single t-SNE run (fixed random_state for
# reproducibility), then restore the per-key grouping.
embedding_clusters = np.array(embedding_clusters)
n, m, k = embedding_clusters.shape
tsne_model_en_2d = TSNE(perplexity=15, n_components=2, init='pca', n_iter=3500, random_state=32)
embeddings_en_2d = np.array(tsne_model_en_2d.fit_transform(embedding_clusters.reshape(n * m, k))).reshape(n, m, 2)
def tsne_plot_similar_words(title, labels, embedding_clusters, word_clusters, a, filename=None):
    """Scatter-plot one coloured cloud per label, annotating every point.

    Each cluster is drawn in its own rainbow colour with alpha *a*; every
    neighbour point is tagged with its word, and the upper-cased label is
    placed at the cluster centroid.  If *filename* is given the figure is
    also saved as a PNG before being shown.
    """
    plt.figure(figsize=(16, 9))
    palette = cm.rainbow(np.linspace(0, 1, len(labels)))
    for label, cluster, cluster_words, colour in zip(labels, embedding_clusters, word_clusters, palette):
        xs, ys = cluster[:, 0], cluster[:, 1]
        plt.scatter(xs, ys, c=colour, alpha=a, label=label)
        # Tag each neighbour point with its word, slightly offset.
        for idx, neighbour in enumerate(cluster_words):
            plt.annotate(neighbour, alpha=0.5, xy=(xs[idx], ys[idx]), xytext=(5, 2),
                         textcoords='offset points', ha='right', va='bottom', size=8)
        # Key word itself, upper-cased, at the cluster centre.
        plt.annotate(label.upper(), alpha=1.0, xy=(mean(xs), mean(ys)), xytext=(0, 0),
                     textcoords='offset points', ha='center', va='center', size=15)
    plt.legend(loc=4)
    plt.title(title)
    plt.grid(False)
    if filename:
        plt.savefig(filename, format='png', dpi=150, bbox_inches='tight')
    plt.show()
# Draw the neighbour clusters for every key word and save the figure.
tsne_plot_similar_words('Similar words from Google News', keys, embeddings_en_2d, word_clusters, 0.7,
                        'similar_words.png')

# Now project only the key words themselves into 2-D.
words = keys
embeddings = [model_gn[word] for word in words]
tsne_2d = TSNE(perplexity=30, n_components=2, init='pca', n_iter=3500, random_state=32)
embeddings_2d = tsne_2d.fit_transform(embeddings)
def tsne_plot_2d(label, embeddings, words=(), a=1, filename="hhh.png"):
    """Scatter-plot a single set of 2-D embeddings and save it as a PNG.

    Parameters
    ----------
    label : str
        Legend entry for the point cloud.
    embeddings : array-like, shape (n_points, 2)
        2-D coordinates to plot.
    words : sequence of str, optional
        Annotation text for each point, in the same order as *embeddings*.
        (Default changed from a mutable ``[]`` to an immutable ``()`` —
        same behaviour, avoids the shared-mutable-default pitfall.)
    a : float, optional
        Scatter alpha (opacity).
    filename : str, optional
        Output PNG path.  Defaults to ``"hhh.png"`` to stay backward
        compatible with the previously hard-coded debug filename.
    """
    plt.figure(figsize=(16, 9))
    # Single-colour palette, kept for visual consistency with the
    # multi-cluster plots above.
    colors = cm.rainbow(np.linspace(0, 1, 1))
    x = embeddings[:, 0]
    y = embeddings[:, 1]
    plt.scatter(x, y, c=colors, alpha=a, label=label)
    for i, word in enumerate(words):
        plt.annotate(word, alpha=0.3, xy=(x[i], y[i]), xytext=(5, 2),
                     textcoords='offset points', ha='right', va='bottom', size=10)
    plt.legend(loc=4)
    plt.grid(True)
    plt.savefig(filename, format='png', dpi=150, bbox_inches='tight')
    plt.show()
# Visualize the key words themselves in their 2-D t-SNE projection.
tsne_plot_2d('Keywords 2D embeddings visualization', embeddings_2d, words, a=1)
We take the top 200 closest neighbours of each key word and produce multiple t-SNE visualizations, one per perplexity value.
# Repeat the neighbour collection, this time taking the 200 nearest
# neighbours per key word, for the animated (GIF) visualization below.
keys_gif = keys
embedding_clusters_gif = []
word_clusters_gif = []
for key_word in keys:
    neighbours = model_gn.most_similar(key_word, topn=200)
    word_clusters_gif.append([w for w, _ in neighbours])
    embedding_clusters_gif.append([model_gn[w] for w, _ in neighbours])
embedding_clusters_gif = np.array(embedding_clusters_gif)
n, m, k = embedding_clusters_gif.shape
def tsne_plot_similar_words_png(title, embedding_clusters, keys, a, filename):
    """Render one t-SNE frame to *filename* as a PNG (no interactive show).

    Each cluster is scattered in its own colour and labelled at its mean
    position.  Axis limits are fixed at +/-200 so consecutive frames stay
    spatially comparable when stitched into a GIF.
    """
    plt.figure(figsize=(16, 9))
    colors = cm.rainbow(np.linspace(0, 1, len(embedding_clusters)))
    for embeddings, color, key in zip(embedding_clusters, colors, keys):
        x = embeddings[:, 0]
        y = embeddings[:, 1]
        plt.scatter(x, y, c=color, alpha=a)
        # White label with a dark outline so it is readable on any colour.
        plt.text(x.mean(), y.mean(), key.upper(), color='white', weight='bold', fontsize=13,
                 path_effects=[PathEffects.withStroke(linewidth=3, foreground="black", alpha=0.7)])
    plt.title(title)
    plt.grid(True)
    plt.xlim(-200, 200)
    plt.ylim(-200, 200)
    plt.savefig(filename, format='png', dpi=150, bbox_inches='tight')
    # Fix: close the figure.  This function is called ~29 times in a loop;
    # without closing, every figure stays open, leaking memory and
    # triggering matplotlib's "more than 20 figures" warning.
    plt.close()
# Build the GIF: one t-SNE projection per perplexity value in 1..29,
# rendered to a temp directory and then stitched together.
dirpath = tempfile.mkdtemp()
try:
    images = []
    for i in range(1, 30):
        fname = os.path.join(dirpath, str(i) + '.png')
        # Re-fit t-SNE at the current perplexity; the fixed random_state
        # keeps each run reproducible.
        tsne_model_en_2d_gif = TSNE(perplexity=i, n_components=2, init='pca', n_iter=3500, random_state=32)
        embeddings_en_2d_gif = np.array(tsne_model_en_2d_gif.fit_transform(embedding_clusters_gif.reshape(n * m, k))).reshape(n, m, 2)
        tsne_plot_similar_words_png('Vizualizing similar words from Google News using t-SNE (perplexity={})'.format(i), embeddings_en_2d_gif, keys_gif, 0.6, fname)
        images.append(imageio.imread(fname))
        print(f"Finished iteration {i}.")
    imageio.mimsave("2d1.gif", images, duration=0.5)
finally:
    # Fix: always remove the temporary frame directory, even when a t-SNE
    # run or a file write fails part-way through (previously the directory
    # leaked on any exception).
    shutil.rmtree(dirpath)
Image(filename="2d1.gif")